In [1]:
import random
import pandas
import numpy as np
import tensorflow as tf
from sklearn import metrics, cross_validation
from tensorflow.contrib import skflow
In [2]:
# Seed every RNG this notebook can reach so a fresh "Restart & Run All"
# is repeatable. Python's `random` and NumPy keep independent global
# generators, so seeding only one leaves the other nondeterministic.
random.seed(42)
np.random.seed(42)
In [3]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pandas.read_csv('data/titanic_train.csv')
In [4]:
# Single-column feature frame (double brackets keep it a DataFrame, not a Series).
X = data[["Embarked"]]
In [5]:
# Target kept as a one-column DataFrame; later cells select y_train['Survived'] for fitting.
y = data[["Survived"]]
In [6]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=42)
In [7]:
# Distinct raw "Embarked" values seen in the training split (taken before X_train is re-encoded below).
embarked_classes = X_train["Embarked"].unique()
In [8]:
# Show the raw category values for inspection.
print('Embarked has next classes: ', embarked_classes)
In [9]:
# Processor that will be fitted on the training column and then reused to encode the test column.
cat_processor = skflow.preprocessing.CategoricalProcessor()
In [10]:
# Fit the processor on the training feature and materialize the transformed rows as a NumPy array.
# NOTE: this rebinds X_train, so the cell is not idempotent: re-running it alone
# would feed the already-transformed array back into fit_transform.
X_train = np.array(list(cat_processor.fit_transform(X_train)))
In [11]:
# Encode the test feature with the vocabulary learned from the training split.
# Use transform(), not fit_transform(): refitting on the test rows would rebuild
# the category vocabulary from X_test, so the ids fed to the model at prediction
# time could differ from the ids it was trained on (and test data would leak
# into preprocessing).
X_test = np.array(list(cat_processor.transform(X_test)))
Categorical Processor로부터 구한 변수의 클래스 총 개수. 변수의 고유한 클래스뿐 아니라 미지정 값도 포함됨.
In [12]:
# Size of the first (and here only) vocabulary held by the fitted processor;
# both model functions below read this as the number of category ids.
n_classes = len(cat_processor.vocabularies_[0])
In [13]:
# Passed as embedding_size to skflow.ops.categorical_variable in categorical_model.
EMBEDDING_SIZE = 3
In [14]:
def categorical_model(X, y):
    """Logistic regression on top of an embedding of the categorical input.

    X is handed to ``skflow.ops.categorical_variable`` together with the
    module-level ``n_classes`` and ``EMBEDDING_SIZE``; axis 1 of the result
    is then squeezed away before the logistic-regression head is applied
    against ``y``.

    Returns whatever ``skflow.models.logistic_regression`` returns.
    """
    # presumably X holds one category id per row, shape (batch, 1) - TODO confirm
    embedded = skflow.ops.categorical_variable(
        X, n_classes, embedding_size=EMBEDDING_SIZE, name='embarked')
    squeezed = tf.squeeze(embedded, [1])
    return skflow.models.logistic_regression(squeezed, y)
In [15]:
# Estimator wrapping the embedding-based model, configured for a two-class target.
classifier = skflow.TensorFlowEstimator(model_fn=categorical_model, n_classes=2)
In [16]:
# Train on the encoded feature against the 'Survived' column of the training labels.
classifier.fit(X_train, y_train['Survived'])
Out[16]:
In [17]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions second.
# Accuracy is symmetric so the value is unchanged, but the documented order is used
# so this line is not copied as a template for order-sensitive metrics.
print("Accuracy: {0}".format(metrics.accuracy_score(y_test, classifier.predict(X_test))))
In [18]:
# roc_auc_score is NOT symmetric: its signature is (y_true, y_score). With the
# arguments swapped, the model's predictions are treated as ground truth and the
# real labels as scores, which reports a different (wrong) AUC.
# NOTE(review): hard class predictions are used as the score here, as before;
# class probabilities would give a finer-grained ROC if the estimator exposes them.
print("ROC: {0}".format(metrics.roc_auc_score(y_test, classifier.predict(X_test))))
In [19]:
def one_hot_categorical_model(X, y):
    """Logistic regression on top of a one-hot encoding of the categorical input.

    X is expanded by ``skflow.ops.one_hot_matrix`` using the module-level
    ``n_classes``; axis 1 of the result is squeezed away before the
    logistic-regression head is applied against ``y``.

    Returns whatever ``skflow.models.logistic_regression`` returns.
    """
    # presumably X holds one category id per row, shape (batch, 1) - TODO confirm
    one_hot = skflow.ops.one_hot_matrix(X, n_classes)
    squeezed = tf.squeeze(one_hot, [1])
    return skflow.models.logistic_regression(squeezed, y)
In [20]:
# Rebinds `classifier` to a new estimator built on the one-hot model; the
# embedding-based estimator from the earlier cell is no longer reachable by name.
# Unlike the earlier estimator, steps and learning_rate are set explicitly here.
classifier = skflow.TensorFlowEstimator(model_fn=one_hot_categorical_model,
n_classes=2, steps=1000, learning_rate=0.01)
In [21]:
# Train the one-hot model on the same encoded feature and 'Survived' labels.
classifier.fit(X_train, y_train['Survived'])
Out[21]:
In [22]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions second.
# Accuracy is symmetric so the value is unchanged, but the documented order is used
# so this line is not copied as a template for order-sensitive metrics.
print("Accuracy: {0}".format(metrics.accuracy_score(y_test, classifier.predict(X_test))))
In [23]:
# roc_auc_score is NOT symmetric: its signature is (y_true, y_score). With the
# arguments swapped, the model's predictions are treated as ground truth and the
# real labels as scores, which reports a different (wrong) AUC.
# NOTE(review): hard class predictions are used as the score here, as before;
# class probabilities would give a finer-grained ROC if the estimator exposes them.
print("ROC: {0}".format(metrics.roc_auc_score(y_test, classifier.predict(X_test))))
In [ ]: